Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages ------------------------------------------------------------------------ tidyverse 1.3.1 --
v tibble 3.1.1 v dplyr 1.0.5
v tidyr 1.1.3 v stringr 1.4.0
v readr 1.4.0 v forcats 0.5.1
v purrr 0.3.4
-- Conflicts --------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
Loading required package: gsubfn
Loading required package: proto
Loading required package: RSQLite
Attaching package: 'lubridate'
The following objects are masked from 'package:base':
date, intersect, setdiff, union
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Loading required package: Matrix
Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':
expand, pack, unpack
Attaching package: 'arules'
The following object is masked from 'package:dplyr':
recode
The following objects are masked from 'package:base':
abbreviate, write
# Load the de-duplicated source datasets produced by the preprocessing step.
# NOTE(review): relative paths — assumes the working directory is the project root.
df_artist <- read.csv("data/df_artist_sin_duplicados.csv")
df_charts_raw <- read.csv("data/df_charts_sin_duplicados.csv")
df_audio_features_raw <- read.csv("data/audio_features_plano_sin_duplicados.csv")
df_lyrics <- read.csv("data/df_lyrics.csv")
# Data frame ready for the join with charts: one row per (first artist, track URL),
# with all collaborating artists concatenated into artist_all using the literal
# separator ",|,".
df_audio_features <- df_audio_features_raw %>%
group_by(track_name, external_urls_spotify) %>%
mutate(artist_all = paste(artist_name, collapse = ",|,")) %>%
ungroup() %>%
# artist_key = first artist (text before the first ",|," separator).
# BUG FIX: the pattern ",|,.*" treated "|" as regex alternation; the literal
# separator requires escaping the pipe.
mutate(artist_key = sub(",\\|,.*", "", artist_all)) %>%
dplyr::select(artist_name, artist_all, artist_key, everything()) %>%
# TRUE instead of T (T is reassignable); keep one row per first-artist/track URL
distinct(artist_key, external_urls_spotify, .keep_all = TRUE) %>%
as.data.frame()
# Count the number of markets in a comma-separated string.
# BUG FIX: the definition was garbled as `cant_marketscontar_market` while the
# call site below uses `contar_market` — restore the intended name.
contar_market <- function(x){
# strsplit returns a list; unlist before measuring the length
q <- length(unlist(strsplit(x, split = ",")))
return (q)
}
# Derive cant_markets by counting comma-separated entries of markets_concat.
# NOTE(review): relies on the helper above being named `contar_market`.
df_audio_features$cant_markets <- sapply(df_audio_features[,"markets_concat"], contar_market)
# Popularity metric per (artist, track, URL): weeks on chart, stream totals and
# extremes (in millions) and chart-position statistics.
df_charts <- df_charts_raw %>%
  group_by(Artist, Track_Name, URL) %>%
  dplyr::summarise(semanas_sum = n(),
                   streams_sum = (sum(Streams, na.rm = TRUE)/10^6 ),
                   # na.rm added to min/max for consistency with sum/mean,
                   # so an occasional NA does not poison the aggregates
                   streams_min = (min(Streams, na.rm = TRUE)/10^6 ),
                   streams_max = (max(Streams, na.rm = TRUE)/10^6 ),
                   position_avg = mean(Position, na.rm = TRUE),
                   position_min = min(Position, na.rm = TRUE),
                   position_max = max(Position, na.rm = TRUE),
                   # drop grouping explicitly (silences the summarise() message)
                   .groups = "drop") %>%
  mutate(popularidad = as.numeric(streams_sum*semanas_sum/position_avg) )
`summarise()` has grouped output by 'Artist', 'Track_Name'. You can override using the `.groups` argument.
# Column sets reused by the aggregation below: grouping keys, numeric chart
# columns, and the week column used to count distinct charting weeks.
groupping_cols <- c("artist_name","artist_all","artist_key",
"track_name","external_urls_spotify","album_name","album_release_year")
numeric_col_charts <- c("Position","Streams")
week_start <- c("week_start")
# Aggregate join_audio_charts per track: mean of continuous features, first of
# categoricals, min/max/avg of chart numbers, and the number of distinct weeks.
chart_group <- join_audio_charts %>%
group_by(artist_name,artist_all,artist_key,track_name,
external_urls_spotify,album_name,album_release_year)
continuas_summarized = chart_group %>% summarise_at(features_continuas, mean, na.rm = TRUE)
categoricas_summarizes = chart_group %>% summarise_at(features_categoricas, first)
numeric_charts_summarizes = chart_group %>% summarise(across(numeric_col_charts,
list(min=min,max=max,avg=mean)))
cant_semanas = chart_group %>% summarise_at(week_start, n_distinct)
# NOTE(review): this sets names on the VECTOR, not the column name; the actual
# rename happens below on aggregation_df — this line appears to be a no-op.
names(cant_semanas$week_start) <- "cant_semanas"
# cbind drops the 7 grouping key columns from all but the first summary;
# assumes every summary has identical row order (same grouping).
aggregation_df <- cbind(numeric_charts_summarizes,
cant_semanas[,-c(1:7)],continuas_summarized[,-c(1:7)],
categoricas_summarizes[,-c(1:7)])
names(aggregation_df)[names(aggregation_df) == 'week_start'] <- "cant_semanas"
cols <- names(aggregation_df)
numeric_cols <- cols[sapply(aggregation_df,is.numeric)]
summary(aggregation_df[,numeric_cols[2:length(numeric_cols)]])
# Missing-data pattern of the join between audio_features and charts
md.pattern(join_audio_charts, rotate.names = TRUE)
artist_key track_name external_urls_spotify semanas_sum streams_sum streams_min
1975 1 1 1 1 1 1
2 1 1 1 1 1 1
1326 1 1 1 1 1 1
0 0 0 0 0 0
streams_max position_avg position_min position_max popularidad artist_name
1975 1 1 1 1 1 1
2 1 1 1 1 1 1
1326 1 1 1 1 1 0
0 0 0 0 0 1326
artist_all album_name acousticness danceability duration_ms energy instrumentalness
1975 1 1 1 1 1 1 1
2 1 1 1 1 1 1 1
1326 0 0 0 0 0 0 0
1326 1326 1326 1326 1326 1326 1326
liveness loudness speechiness tempo valence cant_markets explicit key_name
1975 1 1 1 1 1 1 1 1
2 1 1 1 1 1 1 1 1
1326 0 0 0 0 0 0 0 0
1326 1326 1326 1326 1326 1326 1326 1326
mode_name key_mode album_release_year
1975 1 1 1 0
2 1 1 0 1
1326 0 0 0 19
1326 1326 1328 25196
## Histograms of the continuous audio_features variables.
# Red line = mean, blue line = median, to eyeball skewness per feature.
for (i in features_continuas){
hist(df_audio_features[,i], main = paste("Histograma de", i, "(all data)"), xlab = i)
abline(v = mean(df_audio_features[,i], na.rm = TRUE) , col="red")
abline(v = median(df_audio_features[,i], na.rm = TRUE) , col="blue")
legend("topright", legend = c("Media", "Mediana"), col=c("red", "blue"), lty =1)
}
# Split the features by distribution: roughly symmetric ones get mean-based
# z-scores; skewed ones get median/MAD-based modified z-scores (see below).
features_continuas_media <- c('danceability', 'tempo', 'valence')
features_continuas_mediana <- c('acousticness', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets')
## Histograms of the continuous variables restricted to charting songs.
# Red line = mean, blue line = median.
for (i in c(features_continuas)){
hist(join_audio_charts[,i], main = paste("Histograma de", i, "(charts)"), xlab = i)
abline(v = mean(join_audio_charts[,i], na.rm = TRUE) , col="red")
abline(v = median(join_audio_charts[,i], na.rm = TRUE) , col="blue")
}
# Split the chart features according to their distribution (mean vs median based).
audio_charts_continuas_media <- c('duration_ms', 'valence')
audio_charts_continuas_mediana <- c('danceability', 'acousticness', 'tempo', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets', "Streams")
## Summary barplots of the categorical audio_features variables (all data),
## sorted by decreasing frequency.
for(i in features_categoricas){
barplot(sort(table(df_audio_features[,i]),decreasing = T), las=2,
main = paste("Barplot de", i, "(all data)"))
# pie(table(df_features_categoricos[,i]))
}
## Summary barplots of the categorical variables for charting songs.
for(i in features_categoricas){
barplot(table(join_audio_charts[,i]), las=2,
main = paste("Barplot de", i, "(charts)")
)
# pie(table(df_features_categoricos[,i]))
}
# 4x3 grid of horizontal boxplots, one per continuous feature.
par(mfrow=c(4,3))
for (feature in features_continuas){
boxplot(df_audio_features[,feature], las=2, horizontal=T, main=feature)
}
Con excepción de valence, el resto de las features poseían cierto sesgo. Se decidió transformar las variables con mayor sesgo (duration_ms, instrumentalness, liveness, speechiness) como método para corregir la distribución y achicar la cantidad de outliers. La variable loudness_reg_imp no fue modificada debido a que, al tomar valores negativos, la transformación logarítmica no puede aplicarse directamente.
# Join the transformed variables back with the remaining features, then
# right-join against the chart data on (track, first artist, URL).
# NOTE(review): cbind assumes df_sesgadas_log_adjust is row-aligned with x.
x <- df_audio_features %>%
select("artist_name","artist_all","artist_key",
"track_name", "external_urls_spotify", "album_name", "album_release_year",
all_of(features_continuas), all_of(features_categoricas)) %>%
select(!variables_sesgo)
join_audio_charts <- cbind(x, df_sesgadas_log_adjust) %>%
right_join( df_charts %>%
select( "Track_Name", "Artist",
"URL","Position", "Streams", "week_start", "week_end"),
by = c("track_name" = "Track_Name",
"artist_key" ="Artist",
"external_urls_spotify" = "URL"))
# Plot original vs log-transformed duration_ms side by side.
# NOTE(review): `merged` is not defined anywhere in this file — confirm which
# data frame this chunk was meant to run against.
variables_plot <- unlist(strsplit("duration_ms", ","))
variables_plot <- append(variables_plot, paste(variables_plot,"_log", sep=""))
variables_plot <- variables_plot[order(variables_plot)]
plotear <- merged[,variables_plot]
par(mfrow = c(1,2))
for (col in names(plotear)){
hist(plotear[,col], breaks="FD", main=col, xlab="")
}
# Features selected for transformation (heavily skewed distributions).
transformacion <- c('instrumentalness','loudness','liveness','speechiness', 'duration_ms')
# Adjusted base-10 logarithm: non-positive inputs are mapped to log10(delta)
# so the transform is defined over the whole domain.
logaritmo_ajustado <- function(x, delta) {
  if (x > 0.0) {
    return(log(x, base = 10))
  }
  log(0.00 + delta, base = 10)
}
# Compare raw vs log10-adjusted histograms for each skewed feature (2x5 grid).
delta <- 10^(-6)
par(mfrow=c(2,5))
for (feature in transformacion){
hist(df_audio_features[,feature], main=feature)
}
for (feature in transformacion){
hist(unlist(lapply(df_audio_features[,feature], function(x) logaritmo_ajustado(x,delta))), main=paste(feature,"log", sep="_"))
}
# Adjusted inverse square root: exact zeros are shifted by delta to avoid
# division by zero; any other value uses 1/sqrt(x) directly.
inv_sqrt_ajustada <- function(x, delta) {
  shifted <- if (x == 0.0) x + delta else x
  1 / sqrt(shifted)
}
# Compare raw vs 1/sqrt-adjusted histograms for each skewed feature (2x5 grid).
delta <- 10^(-6)
par(mfrow=c(2,5))
for (feature in transformacion){
hist(df_audio_features[,feature], main=feature)
}
for (feature in transformacion){
hist(unlist(lapply(df_audio_features[,feature], function(x) inv_sqrt_ajustada(x,delta))), main=paste(feature,"inv_sqt", sep="_"))
}
# Same comparison with a plain square-root transform.
par(mfrow=c(2,5))
for (feature in transformacion){
hist(df_audio_features[,feature], main=feature)
}
for (feature in transformacion){
hist(sqrt(df_audio_features[,feature]), main=paste(feature,"sqrt", sep="_"))
}
# Loudness: regression imputation of (implausible) positive values from
# energy + acousticness; loudness_reg_imp keeps the original values elsewhere.
# NOTE(review): the plots at the top read loudness_reg_imp, which is only
# created further down — this chunk appears to have been run out of order
# in the original notebook.
par(mfrow = c(2,1))
hist(df_audio_features[,'loudness_reg_imp'], main='loudness', xlab="")
#hist(sqrt(df_audio_features[,'loudness_reg_imp']), main= 'loudness_sqrt', xlab="")
boxplot(df_audio_features[,'loudness_reg_imp'], horizontal = T)
#boxplot(sqrt(df_audio_features[,'loudness_reg_imp']), horizontal = T)
# Linear model fitted on all rows, then used to replace loudness > 0 rows.
fit <- lm(loudness~energy+acousticness, data=df_audio_features)
modelo <- fit$coefficients
df_audio_features$loudness_reg_imp <- df_audio_features$loudness
X <- df_audio_features[df_audio_features$loudness>0, c('energy', "acousticness")]
df_audio_features$loudness_reg_imp[df_audio_features$loudness>0] <- modelo[1]+modelo[2]*X[,1]+modelo[3]*X[,2]
summary(df_audio_features[,c("loudness", "loudness_reg_imp")])
summary(fit)
La variable instrumentalness tiene mucho sesgo. Se va a recurrir a una logaritmización de la variable, previa transformación del dominio: los valores que son 0 se reemplazan por 0.000001 (delta = 10^-6).
# Adjusted base-10 logarithm (second version): exact zeros are shifted by
# delta before taking the log; every other value is logged as-is.
logaritmo_ajustado <- function(x, delta) {
  input <- if (x == 0.0) x + delta else x
  log(input, base = 10)
}
# Apply the adjusted log transform to instrumentalness and compare raw vs
# transformed with histograms and boxplots (2x2 grid).
# NOTE(review): "insrumentalness" in the plot title is a typo in a runtime string.
delta <- 10^(-6)
df_audio_features$instrumentalness_logadjust <- unlist(lapply(df_audio_features$instrumentalness, function(x) logaritmo_ajustado(x,delta)))
par(mfrow =c(2,2))
hist(df_audio_features$instrumentalness, main="insrumentalness", xlab="")
hist(unlist(lapply(df_audio_features$instrumentalness, function(x) logaritmo_ajustado(x,delta))), main='instrumentalness_logadjust', ylim = c(0,130500), xlab = "")
boxplot(df_audio_features$instrumentalness, main="", horizontal = T)
boxplot(unlist(lapply(df_audio_features$instrumentalness, function(x) logaritmo_ajustado(x,delta))), main="", horizontal=T)
# hist(log(1/sqrt(df_audio_features$instrumentalness+0.00001)),main='log(sqrt(x+))', ylim=c(0,130500), xlab = "")
¿Es útil esta transformación?
# Compare the instrumentalness distribution (raw and log-adjusted) between
# songs that appear in the charts (isinchart == 1) and songs that do not.
delta <- 10^(-6)
df_audio_features$instrumentalness_logadjust <- unlist(lapply(df_audio_features$instrumentalness, function(x) logaritmo_ajustado(x,delta)))
# Flag chart membership via a left join on (track, first artist, URL).
df_chart_tojoin <- df_charts[,c("Track_Name", "Artist", "URL")]
df_chart_tojoin$isinchart <- 1
df_audio_features_tojoin <- df_audio_features[, c("track_name","artist_key","external_urls_spotify","instrumentalness", "instrumentalness_logadjust")]
join_histogram <- df_audio_features_tojoin %>%
dplyr::select("track_name","artist_key","external_urls_spotify","instrumentalness", "instrumentalness_logadjust") %>%
left_join( df_chart_tojoin %>%
select("Track_Name", "Artist", "URL","isinchart"),
by = c(
"track_name" = "Track_Name",
"artist_key" ="Artist",
"external_urls_spotify" = "URL"))
# Rows with no chart match become isinchart = 0.
join_histogram$isinchart[is.na(join_histogram$isinchart)] <- 0
join_histogram$isinchart <- factor(join_histogram$isinchart)
# Rescale the histogram densities to percentages so the panels are comparable.
h11 <- hist(join_histogram[join_histogram$isinchart==1,'instrumentalness'])
h11$density <- h11$counts/sum(h11$counts)*100
h12 <- hist(join_histogram[join_histogram$isinchart==0,'instrumentalness'])
h12$density <- h12$counts/sum(h12$counts)*100
h21 <- hist(join_histogram[join_histogram$isinchart==1,'instrumentalness_logadjust'])
h21$density <- h21$counts/sum(h21$counts)*100
h22 <- hist(join_histogram[join_histogram$isinchart==0,'instrumentalness_logadjust'])
h22$density <- h22$counts/sum(h22$counts)*100
#png("C:/Users/Asus/Desktop/DATA SCIENCE/MAESTRIA/Data Mining/TP/graficos/instrumentalness.png",
# width = 800, height = 800)
par(mfrow = c(3,2))
plot(h11, main='instrumentalness \nchart', xlab="", ylab="Porcentage", freq=FALSE, col='grey', ylim = c(0,100))
plot(h12, main='instrumentalness \nfuera chart', xlab="", ylab="Porcentage", freq=FALSE, col='grey', ylim = c(0,100))
plot(h21, main ="instrumentalness_log \nchart", xlab="", ylab="Porcentage", freq=FALSE, col='grey', ylim = c(0,100))
plot(h22, main ="instrumentalness_log \nfuera chart", xlab="", ylab="Porcentage", freq=FALSE, col='grey', ylim = c(0,100))
boxplot(join_histogram[join_histogram$isinchart==1,'instrumentalness_logadjust'], main="instrumentalness_log chart", horizontal = T)
boxplot(join_histogram[join_histogram$isinchart==0,'instrumentalness_logadjust'], main="instrumentalness_log fuera chart", horizontal = T)
#dev.off()
################################
## OUTLIER FILTERING BY Z-SCORE for 'danceability', 'tempo', 'valence'
##############################
# Standard z-scores for the features whose distribution is roughly normal:
# z = (x - mean) / sd, stored as a new zscore_<col> column per feature.
features_continuas_media <- c('danceability', 'tempo', 'valence')
df_audio_features_zscore_media <- df_audio_features[, features_continuas_media]
zscore_cols <- c()
for (col in names(df_audio_features_zscore_media)) {
  name_col <- paste0('zscore_', col)
  zscore_cols <- append(zscore_cols, name_col)
  # BUG FIX: na.rm = TRUE added — otherwise a single NA makes every z-score NA.
  # This also matches the median/MAD block below, which already uses na.rm.
  media <- mean(df_audio_features_zscore_media[, col], na.rm = TRUE)
  stdv <- sd(df_audio_features_zscore_media[, col], na.rm = TRUE)
  df_audio_features_zscore_media[, name_col] <- (df_audio_features_zscore_media[, col] - media) / stdv
}
# One boxplot per z-scored feature, side by side.
par(mfrow = c(1, length(zscore_cols)))
lapply(zscore_cols, function(col) boxplot(df_audio_features_zscore_media[, col], xlab = col))
Danceability
# Inspect danceability outliers: rows whose |z-score| exceeds 3.
umbral_zscore <- 3
conditions <- (df_audio_features_zscore_media$zscore_danceability> umbral_zscore) | (df_audio_features_zscore_media$zscore_danceability< -1*umbral_zscore)
df_audio_features[conditions,] %>%
select(album_name,artist_name, danceability ) %>%
arrange(-danceability)
Tempo
# Inspect tempo outliers: rows whose |z-score| exceeds 3.
umbral_zscore <- 3
conditions <- (df_audio_features_zscore_media$zscore_tempo> umbral_zscore) | (df_audio_features_zscore_media$zscore_tempo< -1*umbral_zscore)
df_audio_features[conditions,] %>%
select(album_name,artist_name, tempo ) %>%
arrange(-tempo)
Valence
# Inspect valence outliers: rows whose |z-score| exceeds 3.
umbral_zscore <- 3
conditions <- (df_audio_features_zscore_media$zscore_valence> umbral_zscore) | (df_audio_features_zscore_media$zscore_valence< -1*umbral_zscore)
df_audio_features[conditions,] %>%
select(album_name,artist_name, valence ) %>%
arrange(-valence)
################################
## OUTLIER FILTERING BY MODIFIED Z-SCORE for the skewed features:
## 'acousticness', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets'
##############################
# Modified z-score: 0.6745 * (x - median) / MAD — the robust (median/MAD)
# analogue of the standard z-score; 0.6745 is presumably the usual
# Iglewicz–Hoaglin consistency constant (TODO confirm source).
features_continuas_mediana <- c('acousticness', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets')
df_audio_features_zscore_mediana <- df_audio_features[,features_continuas_mediana]
zscoremodif_cols <- c()
for(col in names(df_audio_features_zscore_mediana)){
name_col <- paste('zscoremodif_',col, sep = "")
zscoremodif_cols <- append(zscoremodif_cols, name_col)
med = median(df_audio_features_zscore_mediana[,col], na.rm = T)
MAD = median(abs(df_audio_features_zscore_mediana[,col] - med), na.rm = T)
df_audio_features_zscore_mediana[, name_col] <- 0.6745 * (df_audio_features_zscore_mediana[,col] - med) / MAD
}
# 4x2 grid of horizontal boxplots, one per modified z-score column.
par(mfrow=c(4,2))
lapply(zscoremodif_cols, function(col) boxplot(df_audio_features_zscore_mediana[,col],xlab=col, horizontal = T))
Revisión Variable Instrumentalness
# Discretize instrumentalness into 10 equal-width bins and plot the counts.
# NOTE(review): `intervalos` labels assume the range starts at 0, while cut()
# bins over [min, max] — labels may be slightly off if min(x) > 0; verify.
instrumentalness <- c("instrumentalness", "zscoremodif_instrumentalness")
x <- df_audio_features$instrumentalness
n_interv <- 10
intervalos <- round(seq(0,max(x),by=(max(x)-min(x))/n_interv),2)
labs <- c()
for (i in 1:n_interv){
lab <- paste(intervalos[i],intervalos[i+1], sep='\n')
labs <- append(labs, lab)
}
bins <- cut(x, n_interv, include.lowest = TRUE, labels = labs)
barplot(table(bins))
Hacemos K-means para poder discretizar la variable.
# Elbow method: within-cluster SSE for k = 2..6 to discretize instrumentalness.
# NOTE(review): kmeans has random starts — consider set.seed() for reproducibility.
# Preallocate instead of growing the vector with append() inside the loop.
k_values <- 2:6
sse <- numeric(length(k_values))
for (idx in seq_along(k_values)) {
  k <- k_values[idx]
  clusters <- kmeans(df_audio_features$instrumentalness, centers = k, iter.max = 10, nstart = k)
  sse[idx] <- clusters$tot.withinss
}
plot(2:6,sse, type = 'l', xlab='Cantidad de Clusters', ylab='Suma Error Cuadrático')
# k = 3 chosen from the elbow plot.
clusters3 <- kmeans(df_audio_features$instrumentalness,centers = 3, iter.max = 10, nstart = 3)
df_audio_features$clusters <- factor(clusters3$cluster)
# Label each cluster with its [min - max] instrumentalness range.
# (Avoids shadowing base min()/max() with local variables, as before.)
lev <- levels(df_audio_features$clusters)
labs <- character(length(lev))
for (i in seq_along(lev)) {
  lo <- min(df_audio_features$instrumentalness[df_audio_features$clusters == lev[i]])
  hi <- max(df_audio_features$instrumentalness[df_audio_features$clusters == lev[i]])
  labs[i] <- paste(lo, hi, sep = ' - ')
}
labs
# barplot(table(factor(clusters3$cluster)), labels = labs)
# Transformation trial and normality tests: Shapiro-Wilk p-values for the
# log10 of each continuous feature.
# NOTE(review): stats::shapiro.test is documented to accept 3..5000 samples —
# confirm df_chart_w_lyrics is within that range.
join_audio_charts[1:5,"acousticness"]^2
library(nortest)
log10(df_chart_w_lyrics$acousticness)
for (i in features_continuas){
x <- log10(df_chart_w_lyrics[,i])
x <- shapiro.test(x)
z <- x$p.value
print(z)
}
# Lyrics were originally pulled from MongoDB and cached to CSV (commented out).
# lyrics = mongo(collection = "lyrics", db = "spotify_dm" )
# df_lyrics <- lyrics$find('{}')
#
# write.csv(df_lyrics, "data/df_lyrics.csv")
df_lyrics <- read.csv("data/df_lyrics.csv") %>%
select(-X)
# Keep one row per (artist, track, lyric).
df_lyrics_unicas <- df_lyrics %>%
distinct(artist_name, track_name, lyrics)
# Language filter via textcat language identification.
spa_lyrics = df_lyrics_unicas[textcat(df_lyrics_unicas$lyrics)=="spanish",]
spa_lyrics
en_lyrics = df_lyrics_unicas[textcat(df_lyrics_unicas$lyrics) %in% c("english", "scots"),]
en_lyrics
# Share of songs covered by the two selected languages.
100*(nrow(en_lyrics) + nrow(spa_lyrics))/nrow(df_lyrics_unicas)
# Language contingency table.
idiomas = textcat(df_lyrics_unicas$lyrics)
# sort(table(idiomas), decreasing = T)
# Comment/uncomment depending on which data frame is analyzed.
# df_lyrics_seleccionado = df_lyrics_unicas
df_lyrics_seleccionado = en_lyrics
corpus = Corpus(VectorSource(enc2utf8(df_lyrics_seleccionado$lyrics)))
# Collapse repeated whitespace.
corpus.pro <- tm_map(corpus, stripWhitespace)
inspect(corpus.pro[1])
# Drop everything up to and including the first [...] block (header metadata).
corpus.pro <- tm_map(corpus.pro, content_transformer(
function(x) sub('^.+?\\[.*?\\]',"", x)))
# inspect(corpus.pro[1])
# Drop in-song annotations, e.g.:
# [Verso 1: Luis Fonsi & Daddy Yankee]
corpus.pro <- tm_map(corpus.pro, content_transformer(
function(x) gsub('\\[.*?\\]', '', x)))
# Drop everything after the 'More on Genius' boilerplate.
corpus.pro <- tm_map(corpus.pro, content_transformer(function(x) gsub("More on Genius.*","", x)))
# Lowercase the text.
corpus.pro <- tm_map(corpus.pro, content_transformer(tolower))
# Remove numbers.
corpus.pro <- tm_map(corpus.pro, removeNumbers)
# Custom words can be appended to the stopword list:
# my_stopwords <- append(stopwords("spanish"), 'palabra')
my_stopwords <- append(stopwords("english"), c('yeah', "aint", "get", "got"))
# Remove stopwords (standard English list plus the custom additions).
corpus.pro <- tm_map(corpus.pro, removeWords, stopwords("english"))
corpus.pro <- tm_map(corpus.pro, removeWords, my_stopwords)
# corpus.pro <- tm_map(corpus.pro, removeWords, stopwords("spanish"))
# inspect(corpus.pro[1])
# Remove punctuation.
corpus.pro <- tm_map(corpus.pro, removePunctuation)
# Replace any remaining non-alphanumeric characters with spaces.
corpus.pro <- tm_map(corpus.pro, content_transformer(function(x) str_replace_all(x, "[[:punct:]]", " ")))
# tm_map also accepts user-defined functions: transliterate accents to ASCII.
library(stringi)
replaceAcentos <- function(x) {stri_trans_general(x, "Latin-ASCII")}
corpus.pro <- tm_map(corpus.pro, replaceAcentos)
# Collapse whitespace introduced by the replacements above.
corpus.pro <- tm_map(corpus.pro, stripWhitespace)
# Helper functions.
# Expand English contractions and normalize apostrophe forms before tokenizing.
# BUG FIX: the bare-stem replacements (don -> do, isn -> is, ...) previously
# matched inside longer words ("london" -> "londo", "done" -> "doe"); they are
# now anchored with \b word boundaries.
decontracted <- function(txt){
  txt <- gsub("won't", "will not", txt)
  txt <- gsub("\\'s", " is", txt)
  txt <- gsub("\\'t", " not", txt)
  txt <- gsub("\\'ll", " will", txt)
  txt <- gsub("\\'m", " am", txt)
  txt <- gsub("\\'re", " are", txt)
  txt <- gsub("\\'d", " had", txt)
  txt <- gsub("\\'ve", " have", txt)
  # Negative stems left behind by the "'t" rule above ("don't" -> "don not").
  txt <- gsub("\\bcouldn\\b", "could", txt)
  txt <- gsub("\\bdon\\b", "do", txt)
  txt <- gsub("\\bdoesn\\b", "does", txt)
  txt <- gsub("\\bisn\\b", "is", txt)
  txt <- gsub("\\bmustn\\b", "must", txt)
  txt <- gsub("\\bshouldn\\b", "should", txt)
  txt <- gsub("\\bwasn\\b", "was", txt)
  txt <- gsub("\\'cause", " because", txt)
  # Remaining apostrophes come from g-dropping ("singin'" -> "singing").
  txt <- gsub("\\'", "g", txt)
  return(txt)
}
# Cleaning function: strips lyric metadata, lowercases, expands contractions,
# removes stopwords and digits, and trims whitespace. `language` selects the
# final step ("en" returns as-is, "es" additionally transliterates accents).
# BUG FIXES:
# * the "es" branch used to assign a *function* to txt and return it instead
#   of the cleaned text — it now applies stri_trans_general and returns text;
# * my_stopwords was passed as a plain vector to str_replace_all, which
#   vectorizes over patterns and returns one string per pattern — it is now
#   collapsed into a single \b-anchored regex like the stopword pass above.
text_cleaning = function(txt, stop=FALSE, language){
txt = sub('^.+?\\[.*?\\]',"", txt) # drop everything up to the first [...] block
txt = sub("More on Genius.*","", txt) # drop trailing Genius boilerplate
txt = gsub('\\[.*?\\]', '', txt) # drop [Verse ...] style annotations
txt = gsub("\\n"," ", txt)
txt = gsub("[()]", " ", txt)
txt = tolower(txt)
txt = decontracted(txt)
txt = gsub("\\W+\\b", " ", txt) # collapse word separators
txt = gsub("\\d", " ", txt) # drop digits
stopwords_regex = paste(stopwords('en'), collapse = '\\b|\\b')
stopwords_regex = paste0('\\b', stopwords_regex, '\\b')
txt = stringr::str_replace_all(txt, stopwords_regex, '')
my_stopwords <- c('ooh', 'yeah', "aint", "get", "got", "ayy")
my_stopwords_regex = paste0('\\b', paste(my_stopwords, collapse = '\\b|\\b'), '\\b')
txt = stringr::str_replace_all(txt, my_stopwords_regex, '')
txt = str_trim(txt)
txt = gsub("\\n"," ", txt)
if(language == "en"){
return(txt)
}else if (language == "es"){
# transliterate accents for Spanish lyrics, then return the cleaned text
txt = stri_trans_general(txt, "Latin-ASCII")
return(txt)
}else{
return("Falta definir lenguaje")
}
}
# Turn word separators into sentence breaks, so each word becomes its own
# one-word "sentence" (runs of non-word characters are replaced with ". ").
one_word_setences = function(txt){
  gsub("\\W+\\b", ". ", txt)
}
# Clean the English lyrics in place.
en_lyrics$lyrics = text_cleaning(en_lyrics$lyrics, language = "en")
head(en_lyrics$lyrics, 1)
# Spanish profanity dictionary: concatenate four word lists into one frame
# (each file is a single unnamed column, read as X1).
malas_palabras_1 <- read_csv("data/malas_palabras.txt",
col_names = FALSE)
malas_palabras_2 <- read_csv("data/malas_palabras_translate.txt",
col_names = FALSE)
malas_palabras_3 <- read_csv("data/malas_palabras_wiki.txt",
col_names = FALSE) %>%
select(X1)
malas_palabras_4 <- read_csv("data/palabras_profanas_es.txt",
col_names = FALSE)
malas_palabras <- rbind(malas_palabras_1, malas_palabras_2,
malas_palabras_3, malas_palabras_4)
# Spanish-lyrics cleaner: removes Genius metadata and annotations, lowercases,
# collapses non-word characters and digits, then trims the result.
# `stop` is accepted for signature compatibility but currently unused.
text_cleaning_esp = function(txt, stop=FALSE){
  cleaned <- sub('^.+?\\[.*?\\]',"", txt)         # text before/incl. first [...]
  cleaned <- sub("More on Genius.*","", cleaned)  # trailing Genius boilerplate
  cleaned <- gsub('\\[.*?\\]', '', cleaned)       # [Verse ...] annotations
  cleaned <- gsub("\\n"," ", cleaned)
  cleaned <- gsub("[()]", " ", cleaned)
  cleaned <- tolower(cleaned)
  # cleaned <- decontracted(cleaned)              # English-only; not applied here
  cleaned <- gsub("\\W+\\b", " ", cleaned)        # word separators -> space
  cleaned <- gsub("\\d", " ", cleaned)            # digits -> space
  cleaned <- str_trim(cleaned)
  # cleaned <- stri_trans_general(cleaned, "Latin-ASCII")
  return(cleaned)
}
# BUG FIX: this previously called text_cleaning() without its required
# `language` argument, which errors at the final if(). The word list is
# Spanish, so use the Spanish cleaner defined just above.
malas_palabras$limpias = text_cleaning_esp(malas_palabras$X1)
malas_palabras
malas_palabras %>% filter(startsWith(limpias, "g"))
# Build the English profanity list from the lexicon package's curated lists,
# lowercased and de-duplicated, plus the CMU "biglou" list fetched over HTTP.
bad_words <- c()
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_zac_anger)))
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_alvarez)))
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_arr_bad)))
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_racist)))
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_banned)))
bad_words <- unique(bad_words)
# NOTE(review): network dependency — consider caching this file locally.
biglou <- read.csv("https://www.cs.cmu.edu/~biglou/resources/bad-words.txt", header=FALSE, col.names = c("words"))
# Extract the profane words contained in a single lyric string.
# Returns a character vector of the words flagged by sentimentr::profanity
# against `profanity_lst`, or NULL when none are found.
get_profanities = function(txt, profanity_lst){
  # txt = text_cleaning(txt)
  # one row per word of the lyric
  word_df = as.data.frame(strsplit(txt, "[ ]+"), col.names = "words")
  # score every word against the supplied profanity list
  scored = profanity(get_sentences(word_df), profanity_list = profanity_lst)
  hits = as.vector(scored[scored$profanity_count != 0, ]$words)
  if (length(hits) > 0) {
    return(hits)
  }
  NULL
}
# Attach per-song profanity word lists (list-columns) for both dictionaries.
# NOTE(review): "profabe" looks like a typo for "profane" throughout.
en_lyrics$profabe_biglou <- lapply(en_lyrics$lyrics, function(x) get_profanities(x, biglou$words))
# NOTE(review): this mutate() result is not assigned, so it has no effect.
en_lyrics %>%
mutate(profane_biglou = unlist(get_profanities(lyrics, biglou$words)))
# NOTE(review): profabe_biglou is a LIST column (from lapply) — strsplit expects
# a character vector, so this line likely errors or coerces unexpectedly; verify.
en_lyrics$profabe_biglou = unlist(strsplit(en_lyrics$profabe_biglou, split = " "))
en_lyrics$profabe_badwords <- lapply(en_lyrics$lyrics, function(x) get_profanities(x, bad_words))
str(en_lyrics)
head(en_lyrics,1)
en_lyrics$profabe_biglou[3]
####################################################################
####### Build the term-document matrix of the corpus ###############
####################################################################
# Given a tm corpus, a weighting scheme and a term count, returns a
# documents-by-terms 0/1 presence matrix restricted to the n_terms most
# frequent terms (columns in alphabetical order).
corpus.pro2tdm <- function(corpus, ponderacion, n_terms){
  # term-document matrix under the requested weighting
  tdm <- as.matrix(TermDocumentMatrix(corpus,
                                      control = list(weighting = ponderacion)))
  # overall frequency of each term across the whole corpus
  top_terms <- head(sort(rowSums(tdm), decreasing = TRUE), n_terms)
  # transpose: rows become documents, columns the selected terms (sorted)
  doc_term <- t(tdm[sort(names(top_terms)), ])
  # binarize: presence/absence instead of counts
  doc_term[doc_term > 0] <- 1
  return(doc_term)
}
# Build the binary term matrix for the English lyrics (150 most frequent terms).
corpus_eng = Corpus(VectorSource(enc2utf8(en_lyrics$lyrics)))
matriz <- corpus.pro2tdm(corpus = corpus_eng, ponderacion= "weightTf",n_terms= 150)
dim(matriz)
df_tm <- as.data.frame(matriz)
head(df_tm,2)
## Join the word matrix with artist and track (column 3 — the raw lyric — is dropped).
# Assumes df_tm rows are aligned 1:1 with df_lyrics_seleccionado rows.
df_ly_feat <- cbind(df_lyrics_seleccionado[-c(3)], df_tm)
nrow(df_tm)
nrow(df_lyrics_seleccionado)
nrow(df_ly_feat)
# NOTE(review): `filter` shadows dplyr::filter in this scope — rename if reused.
filter <- !names(df_ly_feat) %in% c("artist_name", "track_name" )
df_ly_feat_ok <- df_ly_feat[, filter]
# df_ly_feat_ok = df_ly_feat_ok[, -(which(colSums(df_ly_feat_ok) == 0))]
# colSums(df_ly_feat_ok)
head(df_ly_feat_ok, 3)
head(df_ly_feat, 3)
# Melt to long format (one row per song id / term), keeping only non-zero
# entries; split binary term indicators from any non-binary values.
df_ly_feat$id = 1:nrow(df_ly_feat)
df_melt <- reshape2::melt(data = df_ly_feat[,3:ncol(df_ly_feat)], id.vars = c("id")) %>%
arrange(id)
df_melt <- df_melt[df_melt$value != 0,]
df_melt_txt <- df_melt[df_melt$value == 1,]
df_melt_cat <- df_melt[df_melt$value != 1,]
head(df_melt_txt )
dim(df_melt_txt )
# Tag the terms: PROF_ prefix for words in the biglou profanity list,
# TERM_ for everything else, then export (id, item) pairs as transactions.
df_melt_txt <- df_melt_txt %>%
mutate(variable = case_when(as.character(variable) %in% biglou$words ~
paste0("PROF_", as.character(variable)),
T ~ paste0("TERM_", as.character(variable))
)
)
df_melt_txt %>% filter(startsWith(variable, "PROF"))
# df_melt_txt[df_melt_txt$variable %in% biglou$words,]
df_melt_txt_to_ruls <- df_melt_txt[, -c(3)]
names(df_melt_txt_to_ruls) <- c("id", "item")
write.table(df_melt_txt_to_ruls, file="data/transaccions_lyrics_features.txt", row.names = F)
# Association rules over the lyric-term transactions.
# TODO(review): check for NA items before mining.
lyrics_trans <- read.transactions("data/transaccions_lyrics_features.txt", format = "single", cols = c(1,2))
arules::inspect(head(lyrics_trans, 3))
summary(lyrics_trans)
reglas <- apriori(lyrics_trans, parameter = list(support=0.1,
confidence = 0.5, target = "rules" ))
# Keep only rules whose right-hand side is a profane term; show top 5 by lift.
reglas_sub <- subset(reglas, subset = rhs %pin% "PROF_")
arules::inspect(head(sort(reglas_sub, by = "lift", decreasing = T),5))
# Rebuild the English profanity list.
# NOTE(review): this duplicates the identical construction earlier in the file —
# consider extracting it into a helper to keep the two copies in sync.
bad_words <- c()
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_zac_anger)))
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_alvarez)))
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_arr_bad)))
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_racist)))
bad_words <- append(bad_words, unique(tolower(lexicon::profanity_banned)))
bad_words <- unique(bad_words)
# Count the total number of profane words in a text, scored with
# sentimentr::profanity against the module-level `bad_words` list.
contar_bad_words <- function(x){
  scores <- profanity(x, profanity_list = bad_words)
  sum(scores$profanity_count)
}
# Count bad words per song, keep explicit songs with at least one, and mine
# association rules between profanity level and chart-position level.
df_chart_w_lyrics$cant_bad_words <- sapply(df_chart_w_lyrics[,"lyrics"], contar_bad_words)
df_chart_w_lyrics_only_explicit <- df_chart_w_lyrics[df_chart_w_lyrics$explicit==TRUE & df_chart_w_lyrics$cant_bad_words > 0, ]
hist(df_chart_w_lyrics_only_explicit$cant_bad_words)
# Create the categorical variables.
df_chart_w_lyrics_only_explicit$nivel_puteada <- cut(df_chart_w_lyrics_only_explicit$cant_bad_words, breaks = c(0,10,20,50,Inf), labels=c("bajo","poco","alto","muy_alto"))
df_chart_w_lyrics_only_explicit$nivel_ranking <- cut(df_chart_w_lyrics_only_explicit$position_avg, breaks = c(1,100,Inf), labels=c("1a100","100a200"))
# NOTE(review): nivel_popularidad is computed from sqrt(cant_bad_words), not
# from the `popularidad` metric — looks like a copy-paste slip; confirm intent.
df_chart_w_lyrics_only_explicit$nivel_popularidad <- cut(sqrt(df_chart_w_lyrics_only_explicit$cant_bad_words), breaks = c(0,10,20,50,Inf), labels=c("bajo","poco","alto","muy_alto"))
transactions <- as(as.data.frame(apply(df_chart_w_lyrics_only_explicit, 2, as.factor)), "transactions")
rules = apriori(transactions, parameter=list(target="rules", confidence=0.25, support=0.1))
rules.sub <- subset(rules, subset = lhs %pin% "nivel_puteada" & rhs %pin% "nivel_ranking")
inspect(head(sort(rules.sub, by = "lift", decreasing = TRUE),10))
# discretizacion continuas y seleccion de variables
# identificar palabras explít
¿Qué características tienen las canciones que están en el chart? ¿Cual es el patrón comun que tienen las canciones más escuchadas? (ver dispersiones, media, grafico comparativo)
# Standardize a numeric vector: (x - mean) / sd, ignoring NAs.
scale_vble <- function(x){
  centered <- x - mean(x, na.rm = TRUE)
  centered / sd(x, na.rm = TRUE)
}
# Songs with audio features that never appear in the charts (anti_join),
# then de-duplicated, reduced to the continuous features and standardized.
anti_join_audio_charts <- df_audio_features %>%
select("artist_name","artist_all", "artist_key",
"track_name", "external_urls_spotify", "album_name", "album_release_year",
all_of(features_continuas), all_of(features_categoricas)) %>%
anti_join( df_charts %>%
select( "Track_Name", "Artist", "URL"),
by = c("external_urls_spotify" ="URL",
"artist_key" ="Artist" ))
# by = c("track_name" = "Track_Name"))
anti_join_audio_charts_complete <- na.omit(anti_join_audio_charts)
anti_join_audio_charts_complete_scale <- anti_join_audio_charts_complete %>%
distinct() %>%
# all_of(): selecting with a bare external vector is ambiguous and deprecated
# in tidyselect (and already used that way in the first select above)
select(all_of(features_continuas)) %>%
mutate_all(scale_vble)
nrow(anti_join_audio_charts_complete_scale)
# Sanity checks: chart rows per artist, and per (track, artist, URL) triple
# (the latter should reveal duplicated keys if any).
join_audio_charts %>%
group_by(artist_name) %>%
dplyr::summarise(n = n()) %>%
arrange(-n)
join_audio_charts %>%
group_by(track_name, artist_name,external_urls_spotify) %>%
dplyr::summarise(n = n()) %>%
arrange(-n) %>%
select(track_name, n, everything(.))
# Time each song spent on the chart: first/last charting week per
# (artist, track, URL) plus the duration in days and calendar years.
df_charts %>%
mutate(week_start=as.Date(week_start),
week_end = as.Date(week_end),
week_year = (year(week_start))) %>%
arrange(Artist, Track_Name) %>%
group_by(Artist, Track_Name, URL) %>%
dplyr:: summarise( day_in = min(week_start),
year_in = year(day_in),
day_max = max(week_end),
year_max = year(day_max),
duracion_chart_dias = day_max-day_in,
duracion_chart_anio = year_max - year_in) %>%
arrange(Artist)